{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.12"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.6.12 64-bit ('py36': conda)"
  },
  "interpreter": {
   "hash": "2e2ff3a457722a20f87dbf10c05994872f65588779806bce29ecd514429d1c22"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "source": [
    "# Fixed seed so the 10,000-row sample below is reproducible across runs.\n",
    "SEED = 42\n",
    "\n",
    "# Load the '测试集' sheet of the Wende corpus workbook.\n",
    "df_1 = pd.read_excel(r'D:\\UCAS\\Phd\\Projects\\202101文德数慧\\网络信息语料 文德 20210122.xlsx', sheet_name='测试集')\n",
    "# Load the Weibo CSV and keep a random 10,000-row subset with a fresh 0..n-1 index.\n",
    "df_0 = pd.read_csv(r'D:\\UCAS\\Phd\\Projects\\202101文德数慧\\weibo_senti_100k.csv')\n",
    "df_0 = df_0.sample(n=10000, random_state=SEED).reset_index(drop=True)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "source": [
    "# Build the label-0 frame in one vectorized step. Appending one row at a time\n",
    "# copies the whole frame on every iteration, and DataFrame.append no longer\n",
    "# exists in pandas 2.x.\n",
    "data = pd.DataFrame({'label': 0, 'text': df_0['review'].to_numpy()})"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "source": [
    "data"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "     label  lable                                               text\n",
       "0      NaN    0.0    如果白云峰真的是gay,我就在也不相信国内还有不弯的型男。伤心死了。[泪][抓狂][泪][泪]\n",
       "1      NaN    0.0  看完电影，俺脑海里马上开始搜索，这是咱认识的哪位美食编辑呢？[哈哈]没敢对号入座！//@美食...\n",
       "2      NaN    0.0  在北京，想去都去不了[失望]//@悠贝亲子图书馆: @保冬妮[给力]讲座！谁要来？@alan...\n",
       "3      NaN    0.0  北京应该是没有狗肉屠宰检疫的，按销售未检疫肉类举报//@Fakuman:北京应该带头取缔狗肉...\n",
       "4      NaN    0.0  好忙，今年的年假都木时间休呢//@新西兰王熊猫: 19号么[泪]忙死了最近，肥不去了这次。咱...\n",
       "...    ...    ...                                                ...\n",
       "9995   NaN    0.0                           我是有多用心，晚上做梦都是写，写，写的！[抓狂]\n",
       "9996   NaN    0.0  #去嘉兴体验城中古镇--月河坊# 哇，我们泡个小咖啡馆，居然还是包间[给力][嘻嘻] 就是这...\n",
       "9997   NaN    0.0                       我痛心是因为我根本不震惊[衰][衰]另一场drama而已\n",
       "9998   NaN    0.0  #高德地图哪都熟#高德地图广告上刊啦！（碰巧和老东家亚马逊的碰到一起[嘻嘻]）。哪位童鞋有看...\n",
       "9999   NaN    0.0  回复@晴儿ff的微博:突然想起来，上午还有人来装马桶，下午还有人来修水槽[泪]，周末啊 //...\n",
       "\n",
       "[10000 rows x 3 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>lable</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>如果白云峰真的是gay,我就在也不相信国内还有不弯的型男。伤心死了。[泪][抓狂][泪][泪]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>看完电影，俺脑海里马上开始搜索，这是咱认识的哪位美食编辑呢？[哈哈]没敢对号入座！//@美食...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>在北京，想去都去不了[失望]//@悠贝亲子图书馆: @保冬妮[给力]讲座！谁要来？@alan...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>北京应该是没有狗肉屠宰检疫的，按销售未检疫肉类举报//@Fakuman:北京应该带头取缔狗肉...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>好忙，今年的年假都木时间休呢//@新西兰王熊猫: 19号么[泪]忙死了最近，肥不去了这次。咱...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9995</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>我是有多用心，晚上做梦都是写，写，写的！[抓狂]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9996</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>#去嘉兴体验城中古镇--月河坊# 哇，我们泡个小咖啡馆，居然还是包间[给力][嘻嘻] 就是这...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9997</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>我痛心是因为我根本不震惊[衰][衰]另一场drama而已</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9998</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>#高德地图哪都熟#高德地图广告上刊啦！（碰巧和老东家亚马逊的碰到一起[嘻嘻]）。哪位童鞋有看...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>回复@晴儿ff的微博:突然想起来，上午还有人来装马桶，下午还有人来修水槽[泪]，周末啊 //...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 3 columns</p>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 12
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "source": [
    "SEED = 42      # fixed seed so the sample, the shuffle and therefore the split are reproducible\n",
    "N_TEST = 2000  # number of shuffled rows held out for the test/dev files\n",
    "\n",
    "df_1 = pd.read_excel(r'D:\\UCAS\\Phd\\Projects\\202101文德数慧\\网络信息语料 文德 20210122.xlsx', sheet_name='测试集')\n",
    "df_0 = pd.read_csv(r'D:\\UCAS\\Phd\\Projects\\202101文德数慧\\weibo_senti_100k.csv')\n",
    "df_0 = df_0.sample(n=10000, random_state=SEED).reset_index(drop=True)\n",
    "\n",
    "# Assemble both classes with a single concat instead of per-row appends,\n",
    "# which re-copy the growing frame on every iteration.\n",
    "# df_0['review'] rows get label 0, df_1['内容'] rows get label 1.\n",
    "data = pd.concat(\n",
    "    [\n",
    "        pd.DataFrame({'label': 0, 'text': df_0['review'].to_numpy()}),\n",
    "        pd.DataFrame({'label': 1, 'text': df_1['内容'].to_numpy()}),\n",
    "    ],\n",
    "    ignore_index=True,\n",
    ")\n",
    "data['label'] = data['label'].astype(int)\n",
    "\n",
    "# Shuffle all rows, then hold out the first N_TEST of them.\n",
    "data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)\n",
    "data_test = data[:N_TEST]\n",
    "data_train = data[N_TEST:]\n",
    "\n",
    "# Tab-separated, no header row and no index column.\n",
    "data_train.to_csv(r\"D:\\UCAS\\Phd\\Projects\\202101文德数慧\\train.tsv\",sep='\\t',header=False,index=False)\n",
    "data_test.to_csv(r\"D:\\UCAS\\Phd\\Projects\\202101文德数慧\\test.tsv\",sep='\\t',header=False,index=False)\n",
    "# NOTE(review): dev.tsv is written from the same rows as test.tsv - confirm this is intended.\n",
    "data_test.to_csv(r\"D:\\UCAS\\Phd\\Projects\\202101文德数慧\\dev.tsv\",sep='\\t',header=False,index=False)\n"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 第二批文德精加工数据筛选"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "df_1 = pd.read_csv('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/result_0525_0628_sq.csv')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "df_2 = pd.read_csv('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/result_0525_0628_ds.csv')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "df = pd.merge(df_1,df_2)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "source": [
    "df"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "              用户昵称                           弹幕内容  \\\n",
       "0         小呆【智贤场控】  欢迎新来的哥哥们 江苏南京跳舞主播 喜欢的点点订阅哦~~~   \n",
       "1         8神月粉丝刘志华                         老婆今天好美   \n",
       "2         小呆【智贤场控】                    羽欢哥 来了，欢迎欢迎   \n",
       "3             夜神疾风                         我喜欢旗袍的   \n",
       "4         小呆【智贤场控】        送出一组（10根荧光棒）可以卡智贤的徽章哦~~   \n",
       "...            ...                            ...   \n",
       "737540         雅丽糕                         我用刀劈开了   \n",
       "737541         雅丽糕                    就像劈男人那样哈哈哈哈   \n",
       "737542         雅丽糕                       完了 展露本性了   \n",
       "737543  小猪ZzzZ【23】                       打你干嘛 直接劈   \n",
       "737544  小猪ZzzZ【23】             你说下播 我说去吧 我好像都说3遍了   \n",
       "\n",
       "                              弹幕内容_清洗  result_sq_v1  score_sq_v1  \\\n",
       "0       欢迎新来的哥哥们 江苏南京跳舞主播 喜欢的点点订阅哦~~~             0     0.999737   \n",
       "1                              老婆今天好美             0     0.999799   \n",
       "2                         羽欢哥 来了，欢迎欢迎             0     0.999775   \n",
       "3                              我喜欢旗袍的             0     0.999768   \n",
       "4             送出一组（10根荧光棒）可以卡智贤的徽章哦~~             0     0.999635   \n",
       "...                               ...           ...          ...   \n",
       "737540                         我用刀劈开了             0     0.999511   \n",
       "737541                    就像劈男人那样哈哈哈哈             0     0.999780   \n",
       "737542                       完了 展露本性了             0     0.999821   \n",
       "737543                       打你干嘛 直接劈             0     0.998826   \n",
       "737544             你说下播 我说去吧 我好像都说3遍了             0     0.999752   \n",
       "\n",
       "        result_ds_v1  score_ds_v1  \n",
       "0                  0     0.996277  \n",
       "1                  0     0.993345  \n",
       "2                  0     0.997224  \n",
       "3                  0     0.997145  \n",
       "4                  0     0.805555  \n",
       "...              ...          ...  \n",
       "737540             0     0.539265  \n",
       "737541             0     0.907276  \n",
       "737542             0     0.852174  \n",
       "737543             1     0.977959  \n",
       "737544             0     0.994127  \n",
       "\n",
       "[737545 rows x 7 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户昵称</th>\n",
       "      <th>弹幕内容</th>\n",
       "      <th>弹幕内容_清洗</th>\n",
       "      <th>result_sq_v1</th>\n",
       "      <th>score_sq_v1</th>\n",
       "      <th>result_ds_v1</th>\n",
       "      <th>score_ds_v1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>小呆【智贤场控】</td>\n",
       "      <td>欢迎新来的哥哥们 江苏南京跳舞主播 喜欢的点点订阅哦~~~</td>\n",
       "      <td>欢迎新来的哥哥们 江苏南京跳舞主播 喜欢的点点订阅哦~~~</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999737</td>\n",
       "      <td>0</td>\n",
       "      <td>0.996277</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8神月粉丝刘志华</td>\n",
       "      <td>老婆今天好美</td>\n",
       "      <td>老婆今天好美</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999799</td>\n",
       "      <td>0</td>\n",
       "      <td>0.993345</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>小呆【智贤场控】</td>\n",
       "      <td>羽欢哥 来了，欢迎欢迎</td>\n",
       "      <td>羽欢哥 来了，欢迎欢迎</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999775</td>\n",
       "      <td>0</td>\n",
       "      <td>0.997224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>夜神疾风</td>\n",
       "      <td>我喜欢旗袍的</td>\n",
       "      <td>我喜欢旗袍的</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999768</td>\n",
       "      <td>0</td>\n",
       "      <td>0.997145</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>小呆【智贤场控】</td>\n",
       "      <td>送出一组（10根荧光棒）可以卡智贤的徽章哦~~</td>\n",
       "      <td>送出一组（10根荧光棒）可以卡智贤的徽章哦~~</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999635</td>\n",
       "      <td>0</td>\n",
       "      <td>0.805555</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>737540</th>\n",
       "      <td>雅丽糕</td>\n",
       "      <td>我用刀劈开了</td>\n",
       "      <td>我用刀劈开了</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999511</td>\n",
       "      <td>0</td>\n",
       "      <td>0.539265</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>737541</th>\n",
       "      <td>雅丽糕</td>\n",
       "      <td>就像劈男人那样哈哈哈哈</td>\n",
       "      <td>就像劈男人那样哈哈哈哈</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999780</td>\n",
       "      <td>0</td>\n",
       "      <td>0.907276</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>737542</th>\n",
       "      <td>雅丽糕</td>\n",
       "      <td>完了 展露本性了</td>\n",
       "      <td>完了 展露本性了</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999821</td>\n",
       "      <td>0</td>\n",
       "      <td>0.852174</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>737543</th>\n",
       "      <td>小猪ZzzZ【23】</td>\n",
       "      <td>打你干嘛 直接劈</td>\n",
       "      <td>打你干嘛 直接劈</td>\n",
       "      <td>0</td>\n",
       "      <td>0.998826</td>\n",
       "      <td>1</td>\n",
       "      <td>0.977959</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>737544</th>\n",
       "      <td>小猪ZzzZ【23】</td>\n",
       "      <td>你说下播 我说去吧 我好像都说3遍了</td>\n",
       "      <td>你说下播 我说去吧 我好像都说3遍了</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999752</td>\n",
       "      <td>0</td>\n",
       "      <td>0.994127</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>737545 rows × 7 columns</p>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 14
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### sq ds 合并"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "from tqdm import tqdm  # not used by this cell any more; kept in case later cells rely on the name\n",
    "\n",
    "# Rows where both result_sq_v1 and result_ds_v1 equal 1 are re-labelled 2 in\n",
    "# result_ds_v1. One boolean mask replaces the per-row Python loop over all\n",
    "# ~737k rows, which took about two minutes.\n",
    "both_flagged = (df['result_sq_v1'] == 1) & (df['result_ds_v1'] == 1)\n",
    "df.loc[both_flagged, 'result_ds_v1'] = 2"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# df.to_csv('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/result_0525_0628_merge.csv',index=False,encoding='utf_8_sig')\n",
    "df.to_excel('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/result_0525_0628_merge.xlsx',engine='xlsxwriter')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 筛选sq数据约3000条 (当前大概只用了前面3000-4000条)"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "source": [
    "df_sq = df[(df['result_sq_v1'] == 1) & (df['score_sq_v1'] > 0.7)].reset_index(drop=True)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "source": [
    "df_sq"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "            用户昵称             弹幕内容          弹幕内容_清洗  result_sq_v1  score_sq_v1  \\\n",
       "0           感冒是病             啊 你妹             啊 你妹             1     0.729628   \n",
       "1           Tmsg  四夷重译称天子，否极泰来九国春  四夷重译称天子，否极泰来九国春             1     0.896735   \n",
       "2             蜜茶               浑圆               浑圆             1     0.743499   \n",
       "3           刘大昊昊              好嫩啊              好嫩啊             1     0.980578   \n",
       "4           卟爱别爱             倒挂金钩             倒挂金钩             1     0.701661   \n",
       "...          ...              ...              ...           ...          ...   \n",
       "6279       李哥儿粉丝             露jio             露jio             1     0.973736   \n",
       "6280       GLO菜菜     看看jiao看看jiao     看看jiao看看jiao             1     0.951069   \n",
       "6281      气质气质莫总            秦诗杨快射            秦诗杨快射             1     0.996730   \n",
       "6282     冷落人生343             漏肉表演             漏肉表演             1     0.994746   \n",
       "6283  超级玛丽【独宠琉璃】           在G2那聊天           在G2那聊天             1     0.981914   \n",
       "\n",
       "      result_ds_v1  score_ds_v1  \n",
       "0                0     0.914148  \n",
       "1                0     0.996132  \n",
       "2                0     0.989762  \n",
       "3                0     0.985516  \n",
       "4                2     0.951298  \n",
       "...            ...          ...  \n",
       "6279             2     0.790449  \n",
       "6280             0     0.994392  \n",
       "6281             2     0.989274  \n",
       "6282             2     0.965516  \n",
       "6283             0     0.997476  \n",
       "\n",
       "[6284 rows x 7 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户昵称</th>\n",
       "      <th>弹幕内容</th>\n",
       "      <th>弹幕内容_清洗</th>\n",
       "      <th>result_sq_v1</th>\n",
       "      <th>score_sq_v1</th>\n",
       "      <th>result_ds_v1</th>\n",
       "      <th>score_ds_v1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>感冒是病</td>\n",
       "      <td>啊 你妹</td>\n",
       "      <td>啊 你妹</td>\n",
       "      <td>1</td>\n",
       "      <td>0.729628</td>\n",
       "      <td>0</td>\n",
       "      <td>0.914148</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Tmsg</td>\n",
       "      <td>四夷重译称天子，否极泰来九国春</td>\n",
       "      <td>四夷重译称天子，否极泰来九国春</td>\n",
       "      <td>1</td>\n",
       "      <td>0.896735</td>\n",
       "      <td>0</td>\n",
       "      <td>0.996132</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>蜜茶</td>\n",
       "      <td>浑圆</td>\n",
       "      <td>浑圆</td>\n",
       "      <td>1</td>\n",
       "      <td>0.743499</td>\n",
       "      <td>0</td>\n",
       "      <td>0.989762</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>刘大昊昊</td>\n",
       "      <td>好嫩啊</td>\n",
       "      <td>好嫩啊</td>\n",
       "      <td>1</td>\n",
       "      <td>0.980578</td>\n",
       "      <td>0</td>\n",
       "      <td>0.985516</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>卟爱别爱</td>\n",
       "      <td>倒挂金钩</td>\n",
       "      <td>倒挂金钩</td>\n",
       "      <td>1</td>\n",
       "      <td>0.701661</td>\n",
       "      <td>2</td>\n",
       "      <td>0.951298</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6279</th>\n",
       "      <td>李哥儿粉丝</td>\n",
       "      <td>露jio</td>\n",
       "      <td>露jio</td>\n",
       "      <td>1</td>\n",
       "      <td>0.973736</td>\n",
       "      <td>2</td>\n",
       "      <td>0.790449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6280</th>\n",
       "      <td>GLO菜菜</td>\n",
       "      <td>看看jiao看看jiao</td>\n",
       "      <td>看看jiao看看jiao</td>\n",
       "      <td>1</td>\n",
       "      <td>0.951069</td>\n",
       "      <td>0</td>\n",
       "      <td>0.994392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6281</th>\n",
       "      <td>气质气质莫总</td>\n",
       "      <td>秦诗杨快射</td>\n",
       "      <td>秦诗杨快射</td>\n",
       "      <td>1</td>\n",
       "      <td>0.996730</td>\n",
       "      <td>2</td>\n",
       "      <td>0.989274</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6282</th>\n",
       "      <td>冷落人生343</td>\n",
       "      <td>漏肉表演</td>\n",
       "      <td>漏肉表演</td>\n",
       "      <td>1</td>\n",
       "      <td>0.994746</td>\n",
       "      <td>2</td>\n",
       "      <td>0.965516</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6283</th>\n",
       "      <td>超级玛丽【独宠琉璃】</td>\n",
       "      <td>在G2那聊天</td>\n",
       "      <td>在G2那聊天</td>\n",
       "      <td>1</td>\n",
       "      <td>0.981914</td>\n",
       "      <td>0</td>\n",
       "      <td>0.997476</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6284 rows × 7 columns</p>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 53
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "source": [
    "# Sort by score_sq_v1, highest score first.\n",
    "df_sq = df_sq.sort_values(by='score_sq_v1', ascending=False)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "source": [
    "df_sq"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "            用户昵称             弹幕内容          弹幕内容_清洗  result_sq_v1  score_sq_v1  \\\n",
       "4914         纯散人             无偿送片             无偿送片             1     0.999679   \n",
       "2424         离魄魂             欧美激情             欧美激情             1     0.999671   \n",
       "5281      勉强18cn             欧美无码             欧美无码             1     0.999669   \n",
       "4801  AA岸似绿岸似透春绿             激情对射             激情对射             1     0.999668   \n",
       "4228       乄打伞的鱼             颜色视频             颜色视频             1     0.999667   \n",
       "...          ...              ...              ...           ...          ...   \n",
       "6115       菠萝蜜橙子         山东菏泽曹县牛批         山东菏泽曹县牛批             1     0.700532   \n",
       "106        回火的木棍              别乱射              别乱射             1     0.700499   \n",
       "678    即将不是光头滴李爽  喜羊羊与灰太狼之古古怪界大作战  喜羊羊与灰太狼之古古怪界大作战             1     0.700440   \n",
       "2264    不是山谷-Sun               发你               发你             1     0.700084   \n",
       "304   放肆的笑゜ 那般凄美              5指山              5指山             1     0.700042   \n",
       "\n",
       "      result_ds_v1  score_ds_v1  \n",
       "4914             2     0.899581  \n",
       "2424             0     0.905589  \n",
       "5281             0     0.689814  \n",
       "4801             2     0.989513  \n",
       "4228             2     0.758837  \n",
       "...            ...          ...  \n",
       "6115             0     0.926102  \n",
       "106              2     0.989902  \n",
       "678              0     0.996353  \n",
       "2264             0     0.818975  \n",
       "304              0     0.995170  \n",
       "\n",
       "[6284 rows x 7 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户昵称</th>\n",
       "      <th>弹幕内容</th>\n",
       "      <th>弹幕内容_清洗</th>\n",
       "      <th>result_sq_v1</th>\n",
       "      <th>score_sq_v1</th>\n",
       "      <th>result_ds_v1</th>\n",
       "      <th>score_ds_v1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4914</th>\n",
       "      <td>纯散人</td>\n",
       "      <td>无偿送片</td>\n",
       "      <td>无偿送片</td>\n",
       "      <td>1</td>\n",
       "      <td>0.999679</td>\n",
       "      <td>2</td>\n",
       "      <td>0.899581</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2424</th>\n",
       "      <td>离魄魂</td>\n",
       "      <td>欧美激情</td>\n",
       "      <td>欧美激情</td>\n",
       "      <td>1</td>\n",
       "      <td>0.999671</td>\n",
       "      <td>0</td>\n",
       "      <td>0.905589</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5281</th>\n",
       "      <td>勉强18cn</td>\n",
       "      <td>欧美无码</td>\n",
       "      <td>欧美无码</td>\n",
       "      <td>1</td>\n",
       "      <td>0.999669</td>\n",
       "      <td>0</td>\n",
       "      <td>0.689814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4801</th>\n",
       "      <td>AA岸似绿岸似透春绿</td>\n",
       "      <td>激情对射</td>\n",
       "      <td>激情对射</td>\n",
       "      <td>1</td>\n",
       "      <td>0.999668</td>\n",
       "      <td>2</td>\n",
       "      <td>0.989513</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4228</th>\n",
       "      <td>乄打伞的鱼</td>\n",
       "      <td>颜色视频</td>\n",
       "      <td>颜色视频</td>\n",
       "      <td>1</td>\n",
       "      <td>0.999667</td>\n",
       "      <td>2</td>\n",
       "      <td>0.758837</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6115</th>\n",
       "      <td>菠萝蜜橙子</td>\n",
       "      <td>山东菏泽曹县牛批</td>\n",
       "      <td>山东菏泽曹县牛批</td>\n",
       "      <td>1</td>\n",
       "      <td>0.700532</td>\n",
       "      <td>0</td>\n",
       "      <td>0.926102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>回火的木棍</td>\n",
       "      <td>别乱射</td>\n",
       "      <td>别乱射</td>\n",
       "      <td>1</td>\n",
       "      <td>0.700499</td>\n",
       "      <td>2</td>\n",
       "      <td>0.989902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>678</th>\n",
       "      <td>即将不是光头滴李爽</td>\n",
       "      <td>喜羊羊与灰太狼之古古怪界大作战</td>\n",
       "      <td>喜羊羊与灰太狼之古古怪界大作战</td>\n",
       "      <td>1</td>\n",
       "      <td>0.700440</td>\n",
       "      <td>0</td>\n",
       "      <td>0.996353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2264</th>\n",
       "      <td>不是山谷-Sun</td>\n",
       "      <td>发你</td>\n",
       "      <td>发你</td>\n",
       "      <td>1</td>\n",
       "      <td>0.700084</td>\n",
       "      <td>0</td>\n",
       "      <td>0.818975</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>304</th>\n",
       "      <td>放肆的笑゜ 那般凄美</td>\n",
       "      <td>5指山</td>\n",
       "      <td>5指山</td>\n",
       "      <td>1</td>\n",
       "      <td>0.700042</td>\n",
       "      <td>0</td>\n",
       "      <td>0.995170</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6284 rows × 7 columns</p>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 55
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "source": [
    "# Keep the first occurrence of each message, treating messages that differ\n",
    "# only by spaces as duplicates. Rows are visited in df_sq's current order, so\n",
    "# when df_sq is sorted by score descending the highest-scoring copy is kept.\n",
    "select_list = []\n",
    "select_list_no_whitesapce = []  # every space-stripped text seen, in order (existing name kept)\n",
    "seen = set()  # same values as the list above, for O(1) membership tests\n",
    "for text in df_sq['弹幕内容_清洗']:\n",
    "    normalized = text.replace(' ', '')\n",
    "    if normalized not in seen:\n",
    "        seen.add(normalized)\n",
    "        select_list.append(text)\n",
    "    select_list_no_whitesapce.append(normalized)\n",
    "count = len(select_list)\n",
    "print(count)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "6266\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "source": [
    "# Write the first 4000 selected messages, one per line. The encoding is set\n",
    "# explicitly because the text is Chinese and the platform default encoding is\n",
    "# not guaranteed to be UTF-8.\n",
    "# NOTE(review): the file name says 3000 but 4000 lines are written - confirm which is intended.\n",
    "with open('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/sq_select_3000.txt', 'w', encoding='utf-8') as f:\n",
    "    f.writelines(text + '\\n' for text in select_list[:4000])\n"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# TODO: unfinished filter. The original cell body was `df[df[]]`, which is a\n",
    "# SyntaxError and stops Restart & Run All. Complete the expression or delete this cell."
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 筛选ds数据约3000条 "
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "source": [
    "df_ds = df[(df['result_ds_v1'] == 1) & (df['score_ds_v1'] > 0.8)].reset_index(drop=True)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "source": [
    "# Sort by score_ds_v1, highest score first.\n",
    "df_ds = df_ds.sort_values(by='score_ds_v1', ascending=False)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "source": [
    "df_ds"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "            用户昵称            弹幕内容         弹幕内容_清洗  result_sq_v1  score_sq_v1  \\\n",
       "35440  德玛西亚万岁587             射2亿             射2亿             0     0.812302   \n",
       "41443   塔寨村房头林耀东           一天橹8次           一天橹8次             0     0.995256   \n",
       "14462     绿皮的西瓜猪           宽好生孩子           宽好生孩子             0     0.998944   \n",
       "26621     AA丶无聊猫             互射？             互射？             0     0.998179   \n",
       "6921    1900刘HoO           重新干一遍           重新干一遍             0     0.998518   \n",
       "...          ...             ...             ...           ...          ...   \n",
       "45922      爱尔小帅哥     原来周周是男人  冲了     原来周周是男人  冲了             0     0.999770   \n",
       "41132    请求狙击张十三           这个咬手指           这个咬手指             0     0.999597   \n",
       "43820         哥哥              要你              要你             0     0.816193   \n",
       "33125       子狱丶虢  你不他打星期日不是也可以吗？  你不他打星期日不是也可以吗？             0     0.997847   \n",
       "34514    好想舔学生妹汁        大大大  弹弹弹        大大大  弹弹弹             0     0.999655   \n",
       "\n",
       "       result_ds_v1  score_ds_v1  \n",
       "35440             1     0.990910  \n",
       "41443             1     0.990779  \n",
       "14462             1     0.990740  \n",
       "26621             1     0.990626  \n",
       "6921              1     0.990591  \n",
       "...             ...          ...  \n",
       "45922             1     0.800084  \n",
       "41132             1     0.800084  \n",
       "43820             1     0.800081  \n",
       "33125             1     0.800075  \n",
       "34514             1     0.800073  \n",
       "\n",
       "[54331 rows x 7 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>用户昵称</th>\n",
       "      <th>弹幕内容</th>\n",
       "      <th>弹幕内容_清洗</th>\n",
       "      <th>result_sq_v1</th>\n",
       "      <th>score_sq_v1</th>\n",
       "      <th>result_ds_v1</th>\n",
       "      <th>score_ds_v1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>35440</th>\n",
       "      <td>德玛西亚万岁587</td>\n",
       "      <td>射2亿</td>\n",
       "      <td>射2亿</td>\n",
       "      <td>0</td>\n",
       "      <td>0.812302</td>\n",
       "      <td>1</td>\n",
       "      <td>0.990910</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41443</th>\n",
       "      <td>塔寨村房头林耀东</td>\n",
       "      <td>一天橹8次</td>\n",
       "      <td>一天橹8次</td>\n",
       "      <td>0</td>\n",
       "      <td>0.995256</td>\n",
       "      <td>1</td>\n",
       "      <td>0.990779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14462</th>\n",
       "      <td>绿皮的西瓜猪</td>\n",
       "      <td>宽好生孩子</td>\n",
       "      <td>宽好生孩子</td>\n",
       "      <td>0</td>\n",
       "      <td>0.998944</td>\n",
       "      <td>1</td>\n",
       "      <td>0.990740</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26621</th>\n",
       "      <td>AA丶无聊猫</td>\n",
       "      <td>互射？</td>\n",
       "      <td>互射？</td>\n",
       "      <td>0</td>\n",
       "      <td>0.998179</td>\n",
       "      <td>1</td>\n",
       "      <td>0.990626</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6921</th>\n",
       "      <td>1900刘HoO</td>\n",
       "      <td>重新干一遍</td>\n",
       "      <td>重新干一遍</td>\n",
       "      <td>0</td>\n",
       "      <td>0.998518</td>\n",
       "      <td>1</td>\n",
       "      <td>0.990591</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45922</th>\n",
       "      <td>爱尔小帅哥</td>\n",
       "      <td>原来周周是男人  冲了</td>\n",
       "      <td>原来周周是男人  冲了</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999770</td>\n",
       "      <td>1</td>\n",
       "      <td>0.800084</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41132</th>\n",
       "      <td>请求狙击张十三</td>\n",
       "      <td>这个咬手指</td>\n",
       "      <td>这个咬手指</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999597</td>\n",
       "      <td>1</td>\n",
       "      <td>0.800084</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43820</th>\n",
       "      <td>哥哥</td>\n",
       "      <td>要你</td>\n",
       "      <td>要你</td>\n",
       "      <td>0</td>\n",
       "      <td>0.816193</td>\n",
       "      <td>1</td>\n",
       "      <td>0.800081</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33125</th>\n",
       "      <td>子狱丶虢</td>\n",
       "      <td>你不他打星期日不是也可以吗？</td>\n",
       "      <td>你不他打星期日不是也可以吗？</td>\n",
       "      <td>0</td>\n",
       "      <td>0.997847</td>\n",
       "      <td>1</td>\n",
       "      <td>0.800075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34514</th>\n",
       "      <td>好想舔学生妹汁</td>\n",
       "      <td>大大大  弹弹弹</td>\n",
       "      <td>大大大  弹弹弹</td>\n",
       "      <td>0</td>\n",
       "      <td>0.999655</td>\n",
       "      <td>1</td>\n",
       "      <td>0.800073</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>54331 rows × 7 columns</p>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 66
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "source": [
    "select_list = []\n",
    "select_list_no_whitesapce = []\n",
    "count = 0\n",
    "for i in range(4000):\n",
    "    text = df_ds.iloc[i]['弹幕内容_清洗']\n",
    "    if text.replace(' ','') not in select_list_no_whitesapce:\n",
    "        select_list.append(text)\n",
    "        count += 1\n",
    "    select_list_no_whitesapce.append(text.replace(' ',''))\n",
    "print(count)\n",
    "with open('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/ds_select_3000.txt','w') as f:\n",
    "    for text in select_list:\n",
    "        f.write(text+'\\n')\n"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3982\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 汇总全部待标注弹幕和筛选后的弹幕"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "source": [
    "with open('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/ds_select_3000.txt','r') as f:\n",
    "    ds_select_3000 = [line.strip() for line in f.readlines()]\n",
    "with open('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/sq_select_3000.txt','r') as f:\n",
    "    sq_select_3000 = [line.strip() for line in f.readlines()]"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "source": [
    "##清除重复部分\n",
    "df['弹幕内容_清洗_'] = df_danmu['弹幕内容_清洗'].str.replace(' ','')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "source": [
    "df.drop_duplicates('弹幕内容_清洗_',inplace=True)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "source": [
    "dammu_all = df['弹幕内容_清洗'].to_list()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "source": [
    "danmu_left = []\n",
    "for d in danmu_all:\n",
    "    if d not in ds_select_3000:\n",
    "        if d not in sq_select_3000:\n",
    "            danmu_left.append(d)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "source": [
    "len(danmu_left)"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "729563"
      ]
     },
     "metadata": {},
     "execution_count": 32
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "source": [
    "df_danmu_left = pd.DataFrame({'弹幕内容_清洗':danmu_left})"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "source": [
    "df_danmu_left"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "                              弹幕内容_清洗\n",
       "0       欢迎新来的哥哥们 江苏南京跳舞主播 喜欢的点点订阅哦~~~\n",
       "1                              老婆今天好美\n",
       "2                         羽欢哥 来了，欢迎欢迎\n",
       "3                              我喜欢旗袍的\n",
       "4             送出一组（10根荧光棒）可以卡智贤的徽章哦~~\n",
       "...                               ...\n",
       "729558                         我用刀劈开了\n",
       "729559                    就像劈男人那样哈哈哈哈\n",
       "729560                       完了 展露本性了\n",
       "729561                       打你干嘛 直接劈\n",
       "729562             你说下播 我说去吧 我好像都说3遍了\n",
       "\n",
       "[729563 rows x 1 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>弹幕内容_清洗</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>欢迎新来的哥哥们 江苏南京跳舞主播 喜欢的点点订阅哦~~~</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>老婆今天好美</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>羽欢哥 来了，欢迎欢迎</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>我喜欢旗袍的</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>送出一组（10根荧光棒）可以卡智贤的徽章哦~~</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>729558</th>\n",
       "      <td>我用刀劈开了</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>729559</th>\n",
       "      <td>就像劈男人那样哈哈哈哈</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>729560</th>\n",
       "      <td>完了 展露本性了</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>729561</th>\n",
       "      <td>打你干嘛 直接劈</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>729562</th>\n",
       "      <td>你说下播 我说去吧 我好像都说3遍了</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>729563 rows × 1 columns</p>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 34
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "source": [
    "df_danmu_left.to_csv('/data/leo/Work/Wende/弹幕数据抓取/0525_0628/danmu_to_anno_0714.csv')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow.python.client import device_lib\n",
    " \n",
    "gpu_device_name = tf.test.gpu_device_name()\n",
    "print(gpu_device_name)\n",
    "print(tf.test.is_gpu_available())\n",
    "local_device_protos = device_lib.list_local_devices()\n",
    "[print(x) for x in local_device_protos if x.device_type == 'GPU']\n"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "source": [
    "import tensorflow as tf\n",
    "physical_devices = tf.config.list_physical_devices('GPU') \n",
    "tf.config.experimental.set_memory_growth(physical_devices[0], True)"
   ],
   "outputs": [
    {
     "output_type": "error",
     "ename": "AttributeError",
     "evalue": "module 'tensorflow' has no attribute 'config'",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-7-6f4d4813b737>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mphysical_devices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlist_physical_devices\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'GPU'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexperimental\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_memory_growth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mphysical_devices\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: module 'tensorflow' has no attribute 'config'"
     ]
    }
   ],
   "metadata": {}
  }
 ]
}